
##################################################################
##### Analysis with unsupervised techniques, EU speeches  ########
##################################################################

# OVERVIEW, what we do here:
# Analyze and visualize stm model with 14 topics
# (for all crucial presteps, e.g. loading and cleaning corpus and robustness tests, see script "Analysis_stm_preprobust" and "Analysis_stm_cleanvalid")

# clean environment
rm(list = ls())

# Set working directory and put the "Data" folder and all files in "RData_for_directory" here!
setwd("")

# load packages
library(dplyr)
library(tidyverse)
library(tidytext)
library(quanteda)
library(stm)
library(stringr)
library(stringi)
library(XML)
library(RCurl)
library(SnowballC)
library(xml2)
library(ggplot2)
library(Rtsne)
library(rsvd)
library(geometry)
library(igraph)
library(stmCorrViz)
library(stargazer)
library(xtable)


############################################

######## Apply unsupervised techniques #####

############################################


############################################
##### STM: Structural Topic Model ##########
############################################

# FIRST: load data and model and find labels for topics
# SECOND: visualize topics and their correlation with speakers 
# THIRD:  do network analysis on correlation among topics

#########################------------------###################################

# FIRST: load model and all other data we need, analyze model, find labels

load("stm_obj_14.RData")
load("meta_stm.Rdata")
load("speaker_unique.Rdata")
load("all_documents.RData")

# plot topic model, simple overview
plot(stm_obj_14, type="summary", xlim=c(0,.4))

# get most representative words in each topic to find most intuitive labels
labelTopics(stm_obj_14, topics = NULL, n = 20, frexweight = 0.5)
# NOTE: this function gives us four different kind of word selection:
# prob: most frequent words
# FREX: most frequent AND exclusive terms
# lift/score: some variants of FREX, see stm vignette


# now, check below for each topic findThoughts (= get most typical texts) to validate/improve label
# NOTE: texts have to be here a character vector where each entry contains the text of a document
# would our sorted (!) supercorpus work here? Yes, subset, texts of supercorpus only, imported as findtexts above
# set topic to specific topic number and identify/evaluate qualitatively the labels for each topic:
#validate_labels <- findThoughts(stm_obj_14, texts = findtexts, topics = 14, n = 3)$docs[[1]]
#validate_labels[1]
#validate_labels[2]
#validate_labels[3]

# define validated labels as vector to be included in all key graphs
topiclabs14 <- c("Financial Crisis, Catalonia","Business and Trade",  "Development, Agriculture", 
                 "EU Crises", "Sports, Honor, National Pride",
                 "Public Sector", "Neighborhood Conflicts",
                 "Collective Memory", "Globalization, Climate", "Religion, Authoritarianism", "Juche, Military",
                 "Corruption, Crime", "(Anti) Immigration", "EU Agreements")


#########################-----------------####################################

# SECOND: visualizations


# FIGURE 2 in paper 

# before we plot, we run a regression to estimate the relationships between topics and metadata (here: speaker) 
# we take speaker because we want to see what proportions of topics the speeches of each speaker contain
# and what kind of topics which speakers share
rel_stm_obj_14 <- estimateEffect(1:14 ~speaker, stm_obj_14, meta)

# now, write function to plot all estimated topic proportions in a loop one after the other (not included in stm package)
# including the 10 FREX words for each topic in the subtitle
plot_speakertopics <- function(x,y){
  for (i in 1:length(x$topics)){
    keywords <- labelTopics(y, topics = NULL, n = 10, frexweight = 0.5)
    frex <- keywords$frex[i,]
    frex <- paste(frex, collapse = ", ")
    frex[[i]] <- paste("Expected Proportion of Topic ", i, ": ", frex, collapse = "/n")
    par(cex=0.65, cex.lab = 1.4, cex.main = 2,
        mar = c(5,5,5,5))
    plot <- plot(x, covariate= "speaker",  
                 model=y,
                 topics = x$topics[i],
                 method = "pointestimate",
                 main = "Estimates for Topics per Speaker",
                 xlab=frex[i],
                 xlim = c(-0.2, 0.9),
                 ylab="Speaker",
                 labeltype = "custom",
                 custom.labels = speaker_unique)
    allplots <- list()
    allplots[[i]] <- plot
    dev.off
  }
}

# make use of my nice plot function for plotting the regression results with the model
plot_speakertopics(rel_stm_obj_14, stm_obj_14)


# rewrite my plot function, this time with labels as suggested above
plot_speakertopics <- function(x,y){
  for (i in 1:length(x$topics)){
    par(cex=0.65, cex.lab = 1.4, cex.main = 2,
        mar = c(5,5,5,5))
    plot <- plot(x, covariate= "speaker",  
                 model=y,
                 topics = x$topics[i],
                 method = "pointestimate",
                 main = "Estimates for Topics per Speaker",
                 xlab=topiclabs14[i],
                 xlim = c(-0.2, 0.9),
                 ylab="Speaker",
                 labeltype = "custom",
                 custom.labels = speaker_unique)
    allplots <- list()
    allplots[[i]] <- plot
    dev.off
  }
}

# plot all 14 topics in a row with labels
plot_speakertopics(rel_stm_obj_14, stm_obj_14)


# plot for latex in better size

par(mar = c(5,11,1,1), mfrow=c(1,1), family = "serif", bty = "n")
plot_speakerlatex <- function(x,y){
  for (i in 1:length(x$topics)){
    plot <- plot(x, covariate= "speaker",  
                 model=y,
                 topics = x$topics[i],
                 method = "pointestimate",
                 xlab=topiclabs14[i],
                 xlim = c(0, 1),
                 ylim = c(0,40),
                 labeltype = "custom",
                 custom.labels = speaker_unique,
                 cex = 2,
                 cex.lab = 1.2,
                 cex.axis = 1,
                 width = 200)
    allplots <- list()
    allplots[[i]] <- plot
    dev.off
  }
}

plot_speakerlatex(rel_stm_obj_14, stm_obj_14)
#pdf("valid_stm/select_14.pdf")
dev.off()

### TABLE 2 in paper


# compile data frame to be exported as LaTeX table with labels, most frequent and frex words
# (similar to Lucas et al.)
get_words <- labelTopics(stm_obj_14, topics = NULL, n = 10, frexweight = 0.5)
# pull and merge all prob (most frequent words)
for (i in 1:14){
  prob <- data.frame(get_words$prob, stringsAsFactors = FALSE)
  prob <- unite(prob, allprob, c("X1":"X10"), sep = ", ", remove = TRUE)
}

#rlang::last_error()

# pull and merge all frex (most frequent AND exclusive words)
for (i in 1:14){
  frex <- data.frame(get_words$frex, stringsAsFactors = FALSE)
  frex <- unite(frex, allfrex, c("X1":"X10"), sep = ", ", remove = TRUE)
}

# merge them into one data frame together with topiclabs14
tabletopics <- data.frame(topiclabs14, prob, frex)
# merge both word colums
tabletopics <- unite(tabletopics, merger, c("allprob", "allfrex"), sep = " newline ", remove = TRUE)

# ok, now export as LaTeX table
stargazer(tabletopics, summary = FALSE, rownames = FALSE)


# now plot also with covariate = scaleideo as continuous variable as per our dictionary results
# to see which topics are more liberal or illiberal according to our dictionary results
# for this, get estimateEffects with covariate = scaleideo 
rel_scale_14 <- estimateEffect(1:14 ~scaleideo, stm_obj_14, meta)

# and rewrite my plot function for this:
plot_autodemotopics <- function(x,y){
  par(cex=0.65, cex.lab = 1.4, cex.main = 2,
      mar = c(5,5,5,5))
  plot <- plot(x, covariate= "scaleideo",  
               model=y,
               topics = x$topics,
               method = "difference",
               cov.value1 = 5,
               cov.value2 = -5,
               main = "Estimates for Topic Proportions in Speeches of Political Leaders",
               xlab="Illiberal vs Liberal Speakers as per Dictionary Results",
               xlim = c(-1, 1),
               ylab="Topics",
               labeltype = "custom",
               custom.labels = topiclabs14,
               verbose.labels = "")
  #custom.labels = prob)
}
plot_autodemotopics(rel_scale_14, stm_obj_14)


# now rewrite my plot function again, simplified for LaTeX table
plot_latex <- function(x,y){
  par(cex=0.65, cex.lab = 1.4, cex.main = 2,
      mar = c(2,0,0,0), bty = "n")
  plot <- plot(x, covariate= "scaleideo",  
               model=y,
               topics = x$topics,
               method = "difference",
               cov.value1 = 5,
               cov.value2 = -5,
               xlim = c(-0.55, 0.55),
               xlab = "",
               labeltype = "custom",
               custom.labels = "",
               verbose.labels = "")
}

plot_latex(rel_scale_14, stm_obj_14)



#########################-----------------####################################

# THIRD: network analysis

# take our model of 14 topics and do something similar to Lucas et al (2015):
# 1. Color topic nodes: red = illiberal, green = liberal
# 2. Change thickness of edge as per strength of correlation between topics
# 3. Change size of node as per proportion of topic in overall corpus

# for this, make use of instructions from Lucas et al.'s (2015) replication scripts:

# first, get the topic proportions in corpus to determine the size of the nodes
# This is the matrix of Correlations Between Topics:
cor(stm_obj_14$theta)

# Size of Topic: Size depends on how you calculate it.  
# stm_obj_14$theta is a D-by-K matrix (so in der Art: D = Diagonal, K = Gesamtsumme)
# with document d in 1:D and its loading onto each topic  
# If you want frequency by word tokens then you just have to 
# multiply through the word counts within each document.
head(stm_obj_14$theta)
# Lucas uses this vector of wordcounts - yet, there was typo in his script (sum(2) not sum[2,])
wordcounts <- unlist(lapply(stm_obj_14$theta, function(x) sum(2)))

# there are fractional wordcounts due to variational approximation.
round(stm_obj_14$theta[,1] * wordcounts,2)

# Calculate the proportion of words devoted to topics
topicPropsInCorpus <- rep(NA,14)
for(i in 1:14){
  topicPropsInCorpus[i] <- (sum(stm_obj_14$theta[,i] * wordcounts))/sum(wordcounts)
}
# This now holds the topic proportions in the corpus
topicPropsInCorpus
# sums to one, as it should
sum(topicPropsInCorpus)
# add the topic labels
names(topicPropsInCorpus) <- topiclabs14
# ok

# now, plot the network using a non-binary distance matrix
mat2 <- cor(stm_obj_14$theta)
# setting the negatives to zero
mat2[mat2<0] <- 0
# setting the diagonal to zero
diag(mat2) <- 0
# this gives us positive correlations between topics
mat2
# rename this object as "out"
out <- mat2

# set a seed so that it's reproducable
set.seed(2138)
# make a graph object called g
g <- graph.adjacency(out, mode="undirected", weighted=T)
if(length(labels)==0) labels = paste("Topic", topics)

# make the edges thickness proportional to correlation 
# (I need them thicker than Lucas et al., put 90* instead of 35*)
cor(stm_obj_14$theta)[,1]
E(g)
edges <- get.edgelist(g)
edgecors <- rep(NA,nrow(edges))
for(i in 1:nrow(edges)){
  edgecors[i] <- cor(stm_obj_14$theta)[edges[i,1],edges[i,2]]
}
edge.width=90*edgecors

# look at and set other graph parameters
E(g)$weight
E(g)$size <- 0.5
E(g)$lty <- 1
E(g)$color <- "black"

#Make the colors indicate the direction of the coefficient
# get the estimates
prep <- plot.estimateEffect(rel_scale_14, covariate= "scaleideo",  
                            model=stm_obj_14,
                            method = "difference",
                            cov.value1 = 5,
                            cov.value2 = -5,
                            main = "Estimates for Topic Proportions in Speeches of Political Leaders",
                            xlab="Illiberal vs Liberal Speakers as per Dictionary Results",
                            xlim = c(-0.4, 0.4),
                            ylab="Topics",
                            labeltype = "custom",
                            custom.labels = topiclabs14,
                            verbose.labels = "")

est <- unlist(lapply(prep$means,function(x){return(x[1])}))
est

# get colors, make them green (liberal) and red (illiberal), similar to dictionary scale
mycols <- rev(colorRampPalette(c("green", "white", "red"), bias=1)( 14 )) ## (n)
mycols
# assign the color category for each coeff (set range as per coeff. range, length = topic numbers)
seq(-0.4,0.4,length.out=14)
colcat <- rep(NA,length(est))
for(i in 1:length(colcat)){
  colcat[i] <- max(which(est[i] > seq(-0.4,0.4,length.out=14)))
}
# These are now the color category for each coefficient
colcat

# and these are the associated colors
mycols[colcat]
# This checks to make sure the colors are working as I expect
# Green should be on the left and red on the right.
plot(est,1:14,pch=19,cex=2,col=mycols[colcat]);abline(v=0)
# ok

# label the vertices
V(g)$label=topiclabs14
# set the size of vertices proportional to the proportion in the corpus
V(g)$size <- topicPropsInCorpus*400
# set the color of vertices
vertex.color = mycols[colcat]
# set other vertex characteristics
vertex.label.cex = 0.9
vertex.label.color = "black"
# set the edge color
edge.color = "gray60"
# set a seet so that the layout is reproduceable
set.seed(333)
# pull out the weights to include in the layout
wts <- E(g)$weight
# make the layout
mylayout <- layout.fruchterman.reingold(g,weight=wts)
# start the image file
#pdf("Pics/Network.pdf", 8,8)
# do the plot
par(mar=c(1,1,1,1))
plot(g, layout=mylayout,edge.color=edge.color,vertex.color=vertex.color, vertex.label.cex=vertex.label.cex, vertex.label.color=vertex.label.color,edge.width=edge.width)
# close the image file
dev.off()
# end of script.

